From 00c23a133da89924b3b746260cadafcc8d2a9d20 Mon Sep 17 00:00:00 2001
From: tsteven4 <13596209+tsteven4@users.noreply.github.com>
Date: Sat, 29 Jun 2024 12:49:56 -0600
Subject: [PATCH] Fix unicsv reader delimiter detection (#1287)
* Fix unicsv reader delimiter detection.
Any delimiter sequence that is enclosed has never been considered
a delimiter when splitting a line into fields. However, previously
an enclosed potential delimiter could be detected when scanning the
first line, and then erroneously used as the delimiter.
* tweak auto
* add unicsv delimiter detection test
---
csv_util.cc | 9 +++++++-
csv_util.h | 3 ++-
reference/unidelim.csv | 2 ++
reference/unidelim.gpx | 10 +++++++++
testo.d/unicsv.test | 5 +++++
unicsv.cc | 43 ++++++++++++++++++++++++++-------------
unicsv.h | 3 ++-
xmldoc/formats/unicsv.xml | 10 ++++++---
8 files changed, 65 insertions(+), 20 deletions(-)
create mode 100644 reference/unidelim.csv
create mode 100644 reference/unidelim.gpx
diff --git a/csv_util.cc b/csv_util.cc
index 1a7ad8420..0fde1d128 100644
--- a/csv_util.cc
+++ b/csv_util.cc
@@ -148,7 +148,8 @@ csv_dequote(const QString& string, const QString& enclosure)
/*****************************************************************************/
QStringList
csv_linesplit(const QString& string, const QString& delimited_by,
- const QString& enclosed_in, const int line_no, CsvQuoteMethod method)
+ const QString& enclosed_in, const int line_no, CsvQuoteMethod method,
+ bool* delimiter_detected)
{
QStringList retval;
@@ -162,6 +163,7 @@ csv_linesplit(const QString& string, const QString& delimited_by,
* whitespace eater consume the space.
*/
QString delimiter = delimited_by;
+ bool delimiter_seen = false;
if (delimited_by == ", ") {
delimiter = ",";
}
@@ -194,8 +196,10 @@ csv_linesplit(const QString& string, const QString& delimited_by,
if (!enclosed) {
if ((dlen > 0) && string.mid(p).startsWith(delimiter)) {
dfound = true;
+ delimiter_seen = true;
} else if (hyper_whitespace_delimiter && string.at(p).isSpace()) {
dfound = true;
+ delimiter_seen = true;
while ((p < string.size()) && string.at(p).isSpace()) {
p++;
}
@@ -235,6 +239,9 @@ csv_linesplit(const QString& string, const QString& delimited_by,
retval.append(value);
}
+ if (delimiter_detected != nullptr) {
+ *delimiter_detected = delimiter_seen;
+ }
return retval;
}
/*****************************************************************************/
diff --git a/csv_util.h b/csv_util.h
index 149b4ff00..3a9f96185 100644
--- a/csv_util.h
+++ b/csv_util.h
@@ -43,7 +43,8 @@ enum class CsvQuoteMethod {historic, rfc4180};
QStringList
csv_linesplit(const QString& string, const QString& delimited_by,
- const QString& enclosed_in, int line_no, CsvQuoteMethod method = CsvQuoteMethod::historic);
+ const QString& enclosed_in, int line_no, CsvQuoteMethod method = CsvQuoteMethod::historic,
+ bool* delimiter_detected = nullptr);
int
dec_to_intdeg(double d);
diff --git a/reference/unidelim.csv b/reference/unidelim.csv
new file mode 100644
index 000000000..370084bd5
--- /dev/null
+++ b/reference/unidelim.csv
@@ -0,0 +1,2 @@
+lat,lon,"foo; bar;","bam wham",name,desc,"zoom|zap",notes
+41.90270080,12.49623520,this,that,"Roma, ð®ð¹","my ""roam'n"" holiday",the other thing,fun
diff --git a/reference/unidelim.gpx b/reference/unidelim.gpx
new file mode 100644
index 000000000..b5812dd96
--- /dev/null
+++ b/reference/unidelim.gpx
@@ -0,0 +1,10 @@
+
+
+
+
+
+ Roma, ð®ð¹
+ my "roam'n" holiday
+ fun
+
+
diff --git a/testo.d/unicsv.test b/testo.d/unicsv.test
index 6c31bba94..99bfdc681 100644
--- a/testo.d/unicsv.test
+++ b/testo.d/unicsv.test
@@ -54,3 +54,8 @@ gpsbabel -i unicsv -f ${REFERENCE}/pretty_degree.csv -o unicsv,grid=1 -F ${TMPDI
compare ${REFERENCE}/pretty_degree1.csv ${TMPDIR}/pretty_degree1.csv
gpsbabel -i unicsv -f ${REFERENCE}/pretty_degree.csv -o unicsv,grid=2 -F ${TMPDIR}/pretty_degree2.csv
compare ${REFERENCE}/pretty_degree2.csv ${TMPDIR}/pretty_degree2.csv
+
+# delimiter detection
+gpsbabel -i unicsv -f ${REFERENCE}/unidelim.csv -o gpx -F ${TMPDIR}/unidelim.gpx
+compare ${REFERENCE}/unidelim.gpx ${TMPDIR}/unidelim.gpx
+
diff --git a/unicsv.cc b/unicsv.cc
index 934df3866..642af6262 100644
--- a/unicsv.cc
+++ b/unicsv.cc
@@ -385,19 +385,27 @@ void
UnicsvFormat::unicsv_fondle_header(QString header)
{
/* Convert the entire header to lower case for convenience.
- * If we see a tab in that header, we decree it to be tabsep.
*/
- unicsv_fieldsep = ",";
- if (header.contains('\t')) {
- unicsv_fieldsep = "\t";
- } else if (header.contains(';')) {
- unicsv_fieldsep = ";";
- } else if (header.contains('|')) {
- unicsv_fieldsep = "|";
- }
header = header.toLower();
- const QStringList values = csv_linesplit(header, unicsv_fieldsep, "\"", 0, CsvQuoteMethod::rfc4180);
+ /* Find the separator and split the line into fields.
+ * If we see an unenclosd tab that is the separator.
+ * Otherwise, if we see an unenclosed semicolon that is the separator.
+ * Otherwise, if we see an unenclosed vertical bar that is the separator.
+ * Otherwise the separator is a comma.
+ */
+ const QList delimiters = {"\t", ";", "|", ","};
+ unicsv_fieldsep = delimiters.last();
+ QStringList values;
+ bool delimiter_detected;
+ for (const auto* delimiter : delimiters) {
+ values = csv_linesplit(header, delimiter, kUnicsvQuoteChar, unicsv_lineno, CsvQuoteMethod::rfc4180, &delimiter_detected);
+ if (delimiter_detected) {
+ unicsv_fieldsep = delimiter;
+ break;
+ }
+ }
+
for (auto value : values) {
value = value.trimmed();
@@ -411,8 +419,12 @@ UnicsvFormat::unicsv_fondle_header(QString header)
}
f++;
}
- if ((f->name.isEmpty()) && global_opts.debug_level) {
- warning(MYNAME ": Unhandled column \"%s\".\n", qPrintable(value));
+ if (global_opts.debug_level) {
+ if ((f->name.isEmpty()) && global_opts.debug_level) {
+ warning(MYNAME ": Unhandled column \"%s\".\n", qPrintable(value));
+ } else {
+ warning(MYNAME ": Interpreting column \"%s\" as %s(%d).\n", qPrintable(value), qPrintable(f->name), f->type);
+ }
}
/* handle some special items */
@@ -456,10 +468,12 @@ UnicsvFormat::rd_init(const QString& fname)
fin = new gpsbabel::TextStream;
fin->open(fname, QIODevice::ReadOnly, MYNAME, opt_codec);
+ unicsv_lineno = 0;
if (opt_fields) {
QString fields = QString(opt_fields).replace("+", ",");
unicsv_fondle_header(fields);
- } else if (buff = fin->readLine(), !buff.isNull()) {
+ } else if (buff = fin->readLine(); !buff.isNull()) {
+ ++unicsv_lineno;
unicsv_fondle_header(buff);
} else {
unicsv_fieldsep = nullptr;
@@ -508,7 +522,7 @@ UnicsvFormat::unicsv_parse_one_line(const QString& ibuf)
wpt->longitude = kUnicsvUnknown;
int column = -1;
- const QStringList values = csv_linesplit(ibuf, unicsv_fieldsep, "\"", 0, CsvQuoteMethod::rfc4180);
+ const QStringList values = csv_linesplit(ibuf, unicsv_fieldsep, kUnicsvQuoteChar, unicsv_lineno, CsvQuoteMethod::rfc4180);
for (auto value : values) {
if (++column >= unicsv_fields_tab.size()) {
break; /* ignore extra fields on line */
@@ -1060,6 +1074,7 @@ UnicsvFormat::read()
}
while ((buff = fin->readLine(), !buff.isNull())) {
+ ++unicsv_lineno;
buff = buff.trimmed();
if (buff.isEmpty() || buff.startsWith('#')) {
continue;
diff --git a/unicsv.h b/unicsv.h
index a8c86664e..a775e4729 100644
--- a/unicsv.h
+++ b/unicsv.h
@@ -148,7 +148,7 @@ private:
/* Constants */
- /* "UNICSV_FIELD_SEP" and "UNICSV_LINE_SEP" are only used by the writer */
+ /* "kUnicsvFieldSep" and "kUnicsvLineSep" are only used by the writer */
static constexpr const char* kUnicsvFieldSep = ",";
static constexpr const char* kUnicsvLineSep = "\r\n";
@@ -189,6 +189,7 @@ private:
double unicsv_depthscale{};
double unicsv_proximityscale{};
const char* unicsv_fieldsep{nullptr};
+ int unicsv_lineno{0};
gpsbabel::TextStream* fin{nullptr};
gpsbabel::TextStream* fout{nullptr};
gpsdata_type unicsv_data_type{unknown_gpsdata};
diff --git a/xmldoc/formats/unicsv.xml b/xmldoc/formats/unicsv.xml
index 297a81a98..d75579c74 100644
--- a/xmldoc/formats/unicsv.xml
+++ b/xmldoc/formats/unicsv.xml
@@ -4,9 +4,13 @@
figure out what data it has and writes headers and all the data it can.
- If the first line contains any tabs, the data lines are assumed
- to be tab separated. Otherwise the fields are assumed to be
- separated by commas.
+ Fields may be enclosed in double quotes. To include a double quote inside quotes escape it with another double quote.
+
+
+ If the first line contains any unenclosed tabs then the data lines are assumed to be tab separated.
+ Otherwise if the first line contains any unenclosed semicolons then fields are assumed to be separated by semicolons.
+ Otherwise if the first line contains any unenclosed vertical bars then fields are assumed to be separated by vertical bars.
+ Otherwise the fields are assumed to be separated by commas.
The list of keywords include:
--
2.30.2